Main analysis
# ----- SETUP -----
# Register the data-distribution summary that rms uses for Predict()/plotting
dd <- datadist(mi_sum)
options(datadist = "dd")
# Single seed shared by every resampling step in this analysis
SEED <- 1408
# Keep a standalone copy of the binary outcome for performance calculations
outcome_vector <- mi_sum$unsuccessful
Test for non-linearity
# Model formula with restricted cubic splines (4 knots) on each continuous
# predictor, used to test for non-linearity
model_form_rcs <- as.formula(unsuccessful ~ hiv + diabetes_yn + rcs(lab_hgb,4) + evertb + xray_cavit + smear_pos + rcs(age,4) + female + rcs(bmi,4) + dishx_any_minus + rcs(educ_years,4) + alcoholhx + drughx + smokhx + non_white)
# Logistic model; x/y stored so rms resampling/validation functions can reuse the fit
rcs_model_si <- lrm(model_form_rcs, data=mi_sum, x=TRUE, y=TRUE)
# Partial-effect plots for each splined continuous predictor
plot(Predict(rcs_model_si, lab_hgb))
plot(Predict(rcs_model_si, age))
plot(Predict(rcs_model_si, bmi))
plot(Predict(rcs_model_si, educ_years))
# Wald tests; the "Nonlinear" rows test the spline (non-linear) components
anova(rcs_model_si)
Wald Statistics Response: unsuccessful
Factor Chi-Square d.f. P
hiv 10.10 1 0.0015
diabetes_yn 10.07 1 0.0015
lab_hgb 12.19 3 0.0068
Nonlinear 0.84 2 0.6578
evertb 0.78 1 0.3761
xray_cavit 0.00 1 0.9676
smear_pos 2.22 1 0.1366
age 8.63 3 0.0347
Nonlinear 6.86 2 0.0325
female 2.02 1 0.1551
bmi 2.68 3 0.4434
Nonlinear 0.19 2 0.9086
dishx_any_minus 0.47 1 0.4910
educ_years 7.15 3 0.0674
Nonlinear 1.72 2 0.4241
alcoholhx 0.29 2 0.8661
drughx 13.55 2 0.0011
smokhx 5.65 2 0.0593
non_white 1.95 1 0.1622
TOTAL NONLINEAR 9.29 8 0.3182
TOTAL 116.35 26 <.0001
Age needs non-linear term. Main analysis will use five age groups, but sensitivity analyses will use spline to compare results.
Full model
# Create five age groups; the spline analysis above showed age is non-linear,
# so the main analysis uses categories (sensitivity analyses use the spline).
mi_sum <- mi_sum %>%
  mutate(age_group = fct_case_when(
    age < 25 ~ "<25",
    age >= 25 & age < 35 ~ "25-35",
    age >= 35 & age < 45 ~ "35-45",
    age >= 45 & age < 55 ~ "45-55",
    age >= 55 ~ "55+"
  ))
# Model variables with the outcome variable first
model_vars <- Cs(unsuccessful, hiv, diabetes_yn, lab_hgb, evertb, xray_cavit, smear_pos, age_group, female, bmi, dishx_any_minus, educ_years, alcoholhx, drughx, smokhx, non_white)
# Number of model terms, used to build formulas later
length_model_vars <- length(model_vars)
# Full model formula: outcome ~ all predictors
model_formula <- as.formula(paste0("unsuccessful ~ ", paste(model_vars[2:length_model_vars], collapse = " + ")))
# Fit the full logistic model; x/y stored for validate()/calibrate()
# (TRUE spelled out rather than T, which is reassignable)
full_model <- lrm(model_formula, data = mi_sum, x = TRUE, y = TRUE)
# Save coefficients
coef_full_model <- full_model$coefficients
# Linear predictor from the full model (log-odds scale)
predvals_full_model <- predict(full_model)
# Bootstrap internal validation of the full model (200 resamples)
full_val <- validate(full_model, B = 200, seed = SEED)
Performance
Brier score, calibration slope, and calibration intercept of the full model
perf_full_model <- save_val_results(full_val)
perf_full_model
[,1]
Brier score 0.137
Calibration slope 1.000
Calibration intercept 0.000
CI for apparent c-statistic
c_full_model <- c_ci(predvals_full_model, outcome_vector)
c_full_model
c_stat LB UB
[1,] 0.779 0.744 0.812
Internal validation
95% CI for the c-statistic internally validated:
c_stat_ci(full_model, data=mi_sum, samples=2000)
2.5% 97.5%
orig 0.779 0.744 0.814
corrected 0.745 0.710 0.779
More validation measures
full_val
index.orig training test optimism index.corrected n
Dxy 0.5571 0.5879 0.5180 0.0699 0.4872 200
R2 0.2276 0.2592 0.1993 0.0598 0.1678 200
Intercept 0.0000 0.0000 -0.1773 0.1773 -0.1773 200
Slope 1.0000 1.0000 0.8364 0.1636 0.8364 200
Emax 0.0000 0.0000 0.0729 0.0729 0.0729 200
D 0.1550 0.1786 0.1342 0.0444 0.1106 200
U -0.0021 -0.0021 0.0045 -0.0066 0.0045 200
Q 0.1571 0.1807 0.1297 0.0510 0.1061 200
B 0.1367 0.1318 0.1407 -0.0089 0.1456 200
g 1.2213 1.3629 1.1311 0.2318 0.9896 200
gp 0.1713 0.1822 0.1605 0.0217 0.1496 200
Bootstrapped backwards selection
The model selection function below was adapted from code provided by Heinze et al. It first fits the full model, then fits one round of backward selection, and then uses 500 bootstrap samples (sampling with replacement) to estimate selected models and then takes the median coefficient value across those samples.
Variable selection adds uncertainty to regression coefficients, which is quantified by the root mean square difference (RMSD) ratios. Additionally, model-based standard errors from the selected model are falsely precise because they do not account for the backwards selection process.
The RMSD ratio (the RMSD of the bootstrap estimates divided by the standard error of that coefficient in the global model) quantifies the uncertainty introduced by variable selection, and expresses the variance inflation or deflation caused by variable selection.
The relative conditional bias estimate quantifies how much variable-selection-induced bias one would have to expect if an IV is selected, and it increases with the uncertainty of selecting each predictor. It is computed as the difference of the mean of sampled regression coefficients computed from those samples where the variable was selected and the global model regression coefficient, divided by the global model regression coefficient. relative conditional bias <10% indicates negligible bias (evidenced by the bootstrap median as close to the global coefficient)
The bootstrap medians and percentile intervals are corrected for that uncertainty: the 2.5th and 97.5th percentiles can be interpreted as the limits of 95% confidence intervals obtained by sampling-based multi-model inference.
# Set seed (model_selection also takes a seed argument; setting it here too
# keeps any RNG use before/around the call reproducible)
set.seed(SEED)
# Backward selection + 500 bootstrap resamples; pval_pair is the selection
# threshold (model_selection is the project helper adapted from Heinze et al.)
main_results <- model_selection(model=model_formula, data=mi_sum, boot=500, return="all", pval_pair=0.01, seed=SEED)
Main results
main_results$overview
| Estimate | Standard error | Estimate | Standard error | RMSD ratio | Bootstrap inclusion frequency (%) | Relative conditional bias (%) | Median | 2.5th percentile | 97.5% percentile | |
|---|---|---|---|---|---|---|---|---|---|---|
| (Intercept) | 0.922 | 0.874 | 0.909 | 0.834 | 1.134 | 100.0 | -4.23 | 0.918 | -1.216 | 2.812 |
| lab_hgb | -0.171 | 0.050 | -0.171 | 0.049 | 1.184 | 98.4 | 6.22 | -0.178 | -0.292 | -0.074 |
| hiv | 0.779 | 0.240 | 0.780 | 0.230 | 1.119 | 97.0 | 3.43 | 0.780 | 0.000 | 1.263 |
| drughxFormer | 0.399 | 0.248 | 0.402 | 0.247 | 1.135 | 96.8 | 16.00 | 0.437 | -0.045 | 0.950 |
| drughxCurrent | 1.097 | 0.293 | 1.124 | 0.285 | 1.319 | 96.8 | 9.14 | 1.157 | 0.000 | 1.877 |
| diabetes_yn | 0.696 | 0.212 | 0.690 | 0.210 | 1.204 | 95.6 | 5.22 | 0.701 | 0.000 | 1.175 |
| age_group25-35 | -0.444 | 0.260 | -0.444 | 0.259 | 1.161 | 88.6 | 11.68 | -0.456 | -0.999 | 0.009 |
| age_group35-45 | -0.639 | 0.281 | -0.644 | 0.279 | 1.248 | 88.6 | 12.03 | -0.671 | -1.280 | 0.000 |
| age_group45-55 | -1.052 | 0.337 | -1.063 | 0.335 | 1.477 | 88.6 | 11.00 | -1.102 | -1.874 | 0.000 |
| age_group55+ | -0.357 | 0.337 | -0.433 | 0.329 | 1.214 | 88.6 | 16.33 | -0.353 | -1.224 | 0.342 |
| educ_years | -0.056 | 0.024 | -0.055 | 0.024 | 1.356 | 83.0 | 20.91 | -0.060 | -0.110 | 0.000 |
| smokhxFormer | 0.580 | 0.239 | 0.590 | 0.229 | 1.334 | 82.4 | 18.60 | 0.616 | 0.000 | 1.074 |
| smokhxCurrent | 0.462 | 0.266 | 0.484 | 0.257 | 1.241 | 82.4 | 20.42 | 0.492 | 0.000 | 1.136 |
| female | -0.329 | 0.215 | -0.356 | 0.213 | 1.256 | 60.8 | 51.13 | -0.368 | -0.780 | 0.000 |
| bmi | -0.045 | 0.030 | -0.043 | 0.029 | 1.340 | 57.4 | 48.67 | -0.048 | -0.109 | 0.000 |
| smear_pos | 0.364 | 0.252 | 0.370 | 0.247 | 1.252 | 53.0 | 53.87 | 0.379 | 0.000 | 0.841 |
| non_white | 0.351 | 0.261 | 0.392 | 0.257 | 1.282 | 52.0 | 73.55 | 0.390 | 0.000 | 0.991 |
| evertb | -0.240 | 0.259 | 0.000 | 0.000 | 1.139 | 34.4 | 131.84 | 0.000 | -0.833 | 0.000 |
| dishx_any_minus | -0.210 | 0.287 | 0.000 | 0.000 | 1.179 | 34.2 | 162.40 | 0.000 | -0.882 | 0.408 |
| alcoholhxFormer | 0.136 | 0.337 | 0.000 | 0.000 | 0.903 | 22.4 | 269.12 | 0.000 | -0.248 | 0.931 |
| alcoholhxCurrent | 0.175 | 0.338 | 0.000 | 0.000 | 0.945 | 22.4 | 249.93 | 0.000 | 0.000 | 1.018 |
| xray_cavit | -0.020 | 0.193 | 0.000 | 0.000 | 0.866 | 18.0 | -348.43 | 0.000 | -0.385 | 0.414 |
Frequency table
The table below shows how often each model was selected during the bootstrap sampling. The boot50 model (including variables selected in at least 50% of bootstrap samples) was selected <2% of the time, which tied it for the most frequently selected model. This illustrates the variability of the resampling procedure.
main_results$freq_table
| Predictors | count | percent | cum_percent | |
|---|---|---|---|---|
| 51 | lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent female smear_pos non_white | 9 | 1.8 | 1.8 |
| 60 | lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent female bmi smear_pos non_white | 9 | 1.8 | 3.6 |
| 16 | lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent female smear_pos | 8 | 1.6 | 5.2 |
| 33 | lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent female non_white | 8 | 1.6 | 6.8 |
| 44 | lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent female bmi non_white | 8 | 1.6 | 8.4 |
| 5 | lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent female | 7 | 1.4 | 9.8 |
| 25 | lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent female bmi smear_pos | 6 | 1.2 | 11.0 |
| 107 | lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent female bmi smear_pos non_white evertb | 6 | 1.2 | 12.2 |
| 125 | lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent female smear_pos dishx_any_minus | 6 | 1.2 | 13.4 |
| 138 | lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent female bmi non_white dishx_any_minus | 6 | 1.2 | 14.6 |
| 7 | lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent bmi | 5 | 1.0 | 15.6 |
| 12 | lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent female bmi | 5 | 1.0 | 16.6 |
| 20 | lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent bmi smear_pos | 4 | 0.8 | 17.4 |
| 39 | lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent bmi non_white | 4 | 0.8 | 18.2 |
| 47 | lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent smear_pos non_white | 4 | 0.8 | 19.0 |
| 77 | lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent bmi smear_pos evertb | 4 | 0.8 | 19.8 |
| 89 | lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent bmi non_white evertb | 4 | 0.8 | 20.6 |
| 106 | lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ smokhxFormer smokhxCurrent female bmi smear_pos non_white evertb | 4 | 0.8 | 21.4 |
| 173 | lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent female bmi smear_pos non_white evertb dishx_any_minus | 4 | 0.8 | 22.2 |
| 224 | lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent female bmi dishx_any_minus alcoholhxFormer alcoholhxCurrent | 4 | 0.8 | 23.0 |
Pairwise table
The table below informs us about the variables that are often included together (“rope teams”) or variables that may be substituted for one another (“competitors”).
- A “>” sign indicates that variables are selected together more often than would be expected by chance by multiplying their individual selection probabilities
- A “<” sign indicates that the variables were selected together less often than would be expected by chance.
main_results$pair_table
| lab_hgb | hiv | drughxFormer | drughxCurrent | diabetes_yn | age_group25-35 | age_group35-45 | age_group45-55 | age_group55+ | educ_years | smokhxFormer | smokhxCurrent | female | bmi | smear_pos | non_white | evertb | dishx_any_minus | alcoholhxFormer | alcoholhxCurrent | xray_cavit | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| lab_hgb | 98.4 | 95.4 | 95.2 | 95.2 | 94 | 87.4 | 87.4 | 87.4 | 87.4 | 81.6 | 81 | 81 | 60.4 | 55.8 | 52.4 | 50.8 | 33.6 | 33.4 | 22.2 | 22.2 | 18 |
| hiv | 97 | 93.8 | 93.8 | 92.6 | 85.6 | 85.6 | 85.6 | 85.6 | 80.6 | 80.2 | 80.2 | 58.2 | 56 | 51.8 | 50.8 | 33.4 | 33.6 | 21.4 | 21.4 | 17.6 | |
| drughxFormer | 96.8 | 96.8 | 92.4 | 85.4 | 85.4 | 85.4 | 85.4 | 79.8 | 79.2 | 79.2 | 58.2 | 55.8 | 51 | 50.8 | 32.8 | 32.8 | 21.4 | 21.4 | 17.6 | ||
| drughxCurrent | > | 96.8 | 92.4 | 85.4 | 85.4 | 85.4 | 85.4 | 79.8 | 79.2 | 79.2 | 58.2 | 55.8 | 51 | 50.8 | 32.8 | 32.8 | 21.4 | 21.4 | 17.6 | ||
| diabetes_yn | 95.6 | 84.8 | 84.8 | 84.8 | 84.8 | 79.4 | 79 | 79 | 57.6 | 56 | 50.6 | 49.4 | 32.6 | 33.2 | 21.8 | 21.8 | 17.6 | ||||
| age_group25-35 | 88.6 | 88.6 | 88.6 | 88.6 | 75.4 | 74.2 | 74.2 | 54.8 | 48.2 | 47.6 | 46.6 | 30 | 32 | 19.6 | 19.6 | 15 | |||||
| age_group35-45 | > | 88.6 | 88.6 | 88.6 | 75.4 | 74.2 | 74.2 | 54.8 | 48.2 | 47.6 | 46.6 | 30 | 32 | 19.6 | 19.6 | 15 | |||||
| age_group45-55 | > | > | 88.6 | 88.6 | 75.4 | 74.2 | 74.2 | 54.8 | 48.2 | 47.6 | 46.6 | 30 | 32 | 19.6 | 19.6 | 15 | |||||
| age_group55+ | > | > | > | 88.6 | 75.4 | 74.2 | 74.2 | 54.8 | 48.2 | 47.6 | 46.6 | 30 | 32 | 19.6 | 19.6 | 15 | |||||
| educ_years | > | < | < | < | 83 | 67.6 | 67.6 | 49.8 | 46 | 44.4 | 42.4 | 29.2 | 28.4 | 18.6 | 18.6 | 15 | |||||
| smokhxFormer | 82.4 | 82.4 | 48.8 | 48.4 | 43.6 | 41.6 | 28.4 | 28.2 | 16.8 | 16.8 | 14.4 | ||||||||||
| smokhxCurrent | > | 82.4 | 48.8 | 48.4 | 43.6 | 41.6 | 28.4 | 28.2 | 16.8 | 16.8 | 14.4 | ||||||||||
| female | 60.8 | 33.2 | 35 | 31.6 | 18.2 | 20.2 | 10.4 | 10.4 | 11.4 | ||||||||||||
| bmi | > | < | < | < | 57.4 | 29 | 30.4 | 20.4 | 19.2 | 12 | 12 | 10 | |||||||||
| smear_pos | 53 | 28.4 | 18.4 | 17.8 | 10.2 | 10.2 | 10.6 | ||||||||||||||
| non_white | 52 | 18.2 | 14 | 10.6 | 10.6 | 8.2 | |||||||||||||||
| evertb | 34.4 | 12.4 | 8.2 | 8.2 | 6.4 | ||||||||||||||||
| dishx_any_minus | < | 34.2 | 7.4 | 7.4 | 6.4 | ||||||||||||||||
| alcoholhxFormer | < | 22.4 | 22.4 | 4.6 | |||||||||||||||||
| alcoholhxCurrent | < | < | 22.4 | 4.6 | |||||||||||||||||
| xray_cavit | 18 |
# Save coefficients from selected model and boot50 model
# sel_est: estimates from the one-shot backward-selected model;
# boot_median: median coefficient across bootstrap samples (0 when the
# variable was not selected) — presumably columns of raw_overview; verify
# against model_selection()'s return value
coef_selected_model <- main_results$raw_overview[,"sel_est"]
coef_boot_median <- main_results$raw_overview[,"boot_median"]
Boot50
The main (“final”) model, based on variables that were selected in at least 50% of bootstrap samples.
The c-statistic of this model is:
# Extract names of variables retained in the boot50 model (non-zero bootstrap medians)
boot_vars_int <- names(coef_boot_median[which(coef_boot_median != 0)])
# Strip factor-level suffixes to recover the underlying variable names.
# NOTE: "+" is a regex quantifier, so "55+" must be escaped as "55\\+";
# unescaped, "age_group55+" is reduced to "age_group+" and the stray "+"
# survives unique(), injecting a spurious term into the formula below.
boot_vars_int <- gsub("Former|Never|Current|25-35|35-45|45-55|55\\+", "", boot_vars_int)
boot_vars_int <- unique(boot_vars_int)
# Drop the intercept (first element)
boot_vars <- boot_vars_int[-1]
# Boot50 formula
boot_form <- as.formula(paste0("unsuccessful ~ ", paste(boot_vars, collapse = " + ")))
# Refit the model on the boot50 variable set (x/y stored for validate/calibrate)
boot_model <- lrm(boot_form, data = mi_sum, x = TRUE, y = TRUE)
# Internal validation of the boot50 model
boot_val <- validate(boot_model, B = 200, seed = SEED)
# Coefficients from the refit including only variables selected in >=50% of bootstraps
coef_boot50_vars <- boot_model$coefficients
Performance
Brier score, calibration slope, and calibration intercept of the boot50 model:
perf_boot50_model <- save_val_results(boot_val)
perf_boot50_model
[,1]
Brier score 0.137
Calibration slope 1.000
Calibration intercept 0.000
CI for apparent c-statistic
# Predicted values
# Linear predictor (log-odds) from the boot50 model; stored on the data frame
# for later plots and tables
predvals_boot50 <- predict(boot_model)
mi_sum$predvals_boot50 <- predvals_boot50
# Apparent c-statistic with 95% CI (c_ci is a project helper)
c_boot50_model <- c_ci(predvals_boot50, outcome_vector)
c_boot50_model
c_stat LB UB
[1,] 0.778 0.74 0.812
Internal validation
c_stat_ci(boot_model, data=mi_sum, samples=2000)
2.5% 97.5%
orig 0.778 0.742 0.812
corrected 0.754 0.720 0.789
Calibration plot
The calibration curve from Frank Harrell’s package is:
# Calibration curve (bootstrap bias-corrected, B = 200 resamples)
# Fixed axis-label typo: "Proability" -> "Probability"
plot(calibrate(boot_model, B=200), xlab="Predicted Probability of Unsuccessful Outcome", ylab="Actual Probability of Unsuccessful Outcome")
n=944 Mean absolute error=0.022 Mean squared error=0.00138
0.9 Quantile of absolute error=0.031
I also wrote a code that constructs a calibration plot, but does not include bias-corrected line.
# My functions for calibration plot
cal_plot_solo(mi_sum, predvals_boot50, outcome_vector)
More validation measures
boot_val
index.orig training test optimism index.corrected n
Dxy 0.5566 0.5770 0.5271 0.0499 0.5067 200
R2 0.2250 0.2490 0.2041 0.0450 0.1801 200
Intercept 0.0000 0.0000 -0.1420 0.1420 -0.1420 200
Slope 1.0000 1.0000 0.8749 0.1251 0.8749 200
Emax 0.0000 0.0000 0.0560 0.0560 0.0560 200
D 0.1531 0.1716 0.1377 0.0339 0.1192 200
U -0.0021 -0.0021 0.0026 -0.0047 0.0026 200
Q 0.1552 0.1737 0.1351 0.0386 0.1166 200
B 0.1369 0.1341 0.1400 -0.0060 0.1429 200
g 1.2053 1.3066 1.1365 0.1701 1.0352 200
gp 0.1703 0.1796 0.1623 0.0173 0.1530 200
ROC curve
# Predicted risk: inverse-logit of the boot50 linear predictor
predrisk_boot50 <- 1/(1+exp(-predvals_boot50))
mi_sum$predrisk_boot50 <- predrisk_boot50
# ROC curve with a DeLong 95% confidence interval
roc <- roc(mi_sum, unsuccessful, predrisk_boot50, ci=TRUE)
roc
Call:
roc.data.frame(data = mi_sum, response = unsuccessful, predictor = predrisk_boot50, ci = TRUE)
Data: predrisk_boot50 in 753 controls (unsuccessful 0) < 191 cases (unsuccessful 1).
Area under the curve: 0.778
95% CI: 0.743-0.813 (DeLong)
plot(roc)
plot(ci.thresholds(roc), type="shape", col="lightblue")
plot(roc, add=TRUE)
Hosmer-Lemeshow test
hoslem.test(outcome_vector, predrisk_boot50, g=20)
Hosmer and Lemeshow goodness of fit (GOF) test
data: outcome_vector, predrisk_boot50
X-squared = 19, df = 18, p-value = 0.4
Shrinkage from boot50 model
Below I use the variables from the boot50 model and model fit to estimate the heuristic shrinkage factor.
A heuristic shrinkage factor is suggested to improve predictions in new patients, and can be estimated as: \[(\chi^2_{model} - df) / \chi^2_{model}\]
# Heuristic shrinkage factor: (model chi-square - df) / model chi-square
# (van Houwelingen-Le Cessie heuristic; value < 1 shrinks coefficients toward 0)
shrinkage_factor <- (boot_model$stats["Model L.R."] - boot_model$stats["d.f."]) / (boot_model$stats["Model L.R."])
# NOTE: the result keeps the "Model L.R." name from the stats vector,
# which is why the printout below is labelled that way
shrinkage_factor
Model L.R.
0.89
# Shrink the boot50 coefficients toward zero by the heuristic factor
coef_shrink_model <- coef_boot50_vars * shrinkage_factor
# Shrunken linear predictor and the corresponding predicted risk
predvals_shrink <- predict(boot_model) * shrinkage_factor
predrisk_shrink <- 1/(1+exp(-predvals_shrink))
mi_sum$predrisk_shrink <- predrisk_shrink
mi_sum$predvals_shrink <- predvals_shrink
Performance
Brier score, calibration slope, and calibration intercept of the shrunken model:
# Apparent performance of the shrunken linear predictor (ev_glm is a project helper)
shrink_perf <- ev_glm(method="original", lp="predvals_shrink", outcome="unsuccessful", data=mi_sum, samples=200, return="performance")
# Pull the apparent (non-resampled) statistics
brier <- shrink_perf["Brier", "apparent"]
slope <- shrink_perf["Slope", "apparent"]
intercept <- shrink_perf["Intercept", "apparent"]
perf_shrink<- rbind("Brier score" = brier, "Calibration slope" = slope, "Calibration intercept" = intercept)
perf_shrink
[,1]
Brier score 0.137
Calibration slope 1.124
Calibration intercept 0.000
CI for apparent c-statistic
c_shrink <- c_ci(predvals_shrink, outcome_vector)
c_shrink
c_stat LB UB
[1,] 0.778 0.744 0.812
Calibration plot
cal_plot_solo(mi_sum, predvals_shrink, outcome_vector)
ROC curve
plot(ev_glm(method="original", lp="predvals_shrink", outcome="unsuccessful", data=mi_sum, samples=1, return="roc"))
This is similar to the ROC curve from the non-shrunk values (coefficients from boot50 vars model)
Bootstrap medians
This section uses the median coefficients estimated from the bootstrap sampling to evaluate model performance.
## Boot medians
# Bootstrap-median coefficients as a data frame, one row per model term
overview <- as.data.frame(main_results$raw_overview) %>%
  rownames_to_column("variable")
# Keep only the terms whose bootstrap median is non-zero
boot_medians <- overview %>%
  filter(boot_median != 0) %>%
  select(variable, boot_median)
# Apply formula
# Linear predictor built from the bootstrap-median coefficients.
# NOTE(review): these constants are transcribed by hand from boot_medians /
# main_results$overview — they must be kept in sync if the selection is rerun;
# consider computing this programmatically from boot_medians instead.
mi_sum <- mi_sum %>%
mutate(predvals_boot_median =
0.9182 +
-0.1783*lab_hgb +
0.7803*hiv +
0.4369*(drughx=="Former") +
1.157*(drughx=="Current") +
0.7006*diabetes_yn +
-0.4557*(age_group=="25-35") +
-0.6709*(age_group=="35-45") +
-1.1024*(age_group=="45-55") +
-0.3533*(age_group=="55+") +
-0.0598*educ_years +
0.6162*(smokhx=="Former") +
0.4924*(smokhx=="Current") +
-0.0477*bmi +
-0.3680*female +
0.3792*smear_pos +
0.3895*non_white,
# Inverse-logit to the probability scale
predrisk_boot_median = 1/(1+exp(-predvals_boot_median))
)
# Summarize predicted values/risks by observed outcome
mi_sum %>%
select(predvals_boot_median, predrisk_boot_median, unsuccessful) %>%
tbl_summary(by="unsuccessful")
| Characteristic | 0, N = 7531 | 1, N = 1911 |
|---|---|---|
| predvals_boot_median | -2.18 (-2.78, -1.40) | -1.04 (-1.68, -0.44) |
| predrisk_boot_median | 0.10 (0.06, 0.20) | 0.26 (0.16, 0.39) |
|
1
Statistics presented: median (IQR)
|
||
Performance
boot_med_perf <- ev_glm(method="original", lp="predvals_boot_median", outcome="unsuccessful", data=mi_sum, samples=200, return="performance")
brier <- boot_med_perf["Brier", "apparent"]
slope <- boot_med_perf["Slope", "apparent"]
intercept <- boot_med_perf["Intercept", "apparent"]
perf_boot_median <- rbind("Brier score" = brier, "Calibration slope" = slope, "Calibration intercept" = intercept)
perf_boot_median
[,1]
Brier score 0.138
Calibration slope 0.966
Calibration intercept 0.123
CI for apparent c-statistic
# Pull the boot-median linear predictor back out of the data frame
predvals_boot_median <- mi_sum$predvals_boot_median
# Apparent c-statistic with 95% CI
c_boot_median <- c_ci(predvals_boot_median, outcome_vector)
c_boot_median
c_stat LB UB
[1,] 0.777 0.742 0.81
Calibration plot
cal_plot_solo(mi_sum, predvals_boot_median, outcome_vector)
ROC curve
plot(ev_glm(method="original", lp="predvals_boot_median", outcome="unsuccessful", data=mi_sum, samples=1, return="roc"))
LASSO
LASSO with glmnet - lambda minimum and 1se were estimated using 10-fold cross validation.
# Matrix of x variable values (model.matrix expands factors; drop the intercept column)
x_vars <- model.matrix(model_formula, mi_sum)[,-1]
# Fit lasso path. NOTE: glmnet() has no seed argument — the fit itself is
# deterministic, so the former seed=SEED was inert and has been removed.
lasso_fit <- glmnet(x_vars, outcome_vector, family="binomial", alpha=1)
# 10-fold CV to select lambda. Fold assignment IS random, and cv.glmnet()
# also has no seed argument, so reproducibility requires set.seed()
# immediately before the call.
set.seed(SEED)
cv_lasso <- cv.glmnet(x_vars, outcome_vector, family = "binomial", type.measure="auc", nfolds=10)
plot(cv_lasso)
Lambda min
Using lambda min, almost all variables from the full model were included:
# Coefficients at the CV-optimal lambda
coef_lasso_min <- coef(cv_lasso, s="lambda.min")
# Predicted risk (response scale) and linear predictor at lambda.min.
# predict() for cv.glmnet is deterministic; the former seed= argument was
# inert and has been removed.
predrisk_lasso_min <- cv_lasso %>% predict(newx = x_vars, s="lambda.min", type="response")
predrisk_lasso_min_vec <- as.vector(predrisk_lasso_min)
mi_sum$predrisk_lasso_min <- predrisk_lasso_min_vec
# Default type = "link": the linear predictor (log-odds)
predvals_lasso_min <- cv_lasso %>% predict(newx = x_vars, s="lambda.min")
predvals_lasso_min_vec <- as.vector(predvals_lasso_min)
mi_sum$predvals_lasso_min <- predvals_lasso_min_vec
# Predicted outcome class at the 0.5 risk threshold
predout_lasso_min <- ifelse(predrisk_lasso_min > 0.5, 1, 0)
Performance
The lambda min model had an accuracy (defined as the concordance between predicted outcome - predicted risk >0.5 = outcome, predicted risk <=0.5 = no outcome vs. the true values of the outcome) of:
mean(predout_lasso_min == outcome_vector)
[1] 0.798
Brier score, calibration slope, and calibration intercept of the model:
lasso_min_perf <- ev_glm(method="original", lp="predvals_lasso_min", outcome="unsuccessful", data=mi_sum, samples=200, return="performance")
brier <- lasso_min_perf["Brier", "apparent"]
slope <- lasso_min_perf["Slope", "apparent"]
intercept <- lasso_min_perf["Intercept", "apparent"]
perf_lasso_min <- rbind("Brier score" = brier, "Calibration slope" = slope, "Calibration intercept" = intercept)
perf_lasso_min
[,1]
Brier score 0.137
Calibration slope 1.021
Calibration intercept 0.024
CI for apparent c-statistic
c_lasso_min <- c_ci(predvals_lasso_min_vec, outcome_vector)
c_lasso_min
c_stat LB UB
[1,] 0.778 0.743 0.811
Calibration plot
And here is the calibration curve and c-statistic:
val.prob(predrisk_lasso_min, outcome_vector, cex=.5)
Dxy C (ROC) R2 D D:Chi-sq D:p U U:Chi-sq
0.55646 0.77823 0.22749 0.15489 147.22043 NA -0.00207 0.04736
U:p Q Brier Intercept Slope Emax E90 Eavg
0.97660 0.15696 0.13662 0.02371 1.02056 0.16528 0.03948 0.02230
S:z S:p
0.17367 0.86213
ROC curve
plot(ev_glm(method="original", lp="predvals_lasso_min", outcome="unsuccessful", data=mi_sum, samples=1, return="roc"))
Lambda 1se
# Coefficients at the more parsimonious 1-SE lambda
coef_lasso_1se <- coef(cv_lasso, s="lambda.1se")
# Predicted values and risks at lambda.1se.
# predict() for cv.glmnet is deterministic; the former seed= argument was
# inert and has been removed.
predrisk_lasso_1se <- cv_lasso %>% predict(newx = x_vars, s="lambda.1se", type="response")
predrisk_lasso_1se_vec <- as.vector(predrisk_lasso_1se)
mi_sum$predrisk_lasso_1se <- predrisk_lasso_1se_vec
# Default type = "link": the linear predictor (log-odds)
predvals_lasso_1se <- cv_lasso %>% predict(newx = x_vars, s="lambda.1se")
predvals_lasso_1se_vec <- as.vector(predvals_lasso_1se)
mi_sum$predvals_lasso_1se <- predvals_lasso_1se_vec
# Predicted outcome class at the 0.5 risk threshold
predout_lasso_1se <- ifelse(predrisk_lasso_1se > 0.5, 1, 0)
Performance
The lambda 1se model had an accuracy (defined as the concordance between predicted outcome - predicted risk >0.5 = outcome, predicted risk <=0.5 = no outcome vs. the true values of the outcome) of:
mean(predout_lasso_1se == outcome_vector)
[1] 0.8
Brier score, calibration slope, and calibration intercept of the lambda 1se model:
lasso_1se_perf <- ev_glm(method="original", lp="predvals_lasso_1se", outcome="unsuccessful", data=mi_sum, samples=200, return="performance")
brier <- lasso_1se_perf["Brier", "apparent"]
slope <- lasso_1se_perf["Slope", "apparent"]
intercept <- lasso_1se_perf["Intercept", "apparent"]
perf_lasso_1se <- rbind("Brier score" = brier, "Calibration slope" = slope, "Calibration intercept" = intercept)
perf_lasso_1se
[,1]
Brier score 0.145
Calibration slope 1.786
Calibration intercept 1.002
CI for apparent c-statistic
c_lasso_1se <- c_ci(predvals_lasso_1se_vec, outcome_vector)
c_lasso_1se
c_stat LB UB
[1,] 0.744 0.705 0.782
Calibration plot
And here is the calibration curve and c-statistic:
val.prob(predrisk_lasso_1se, outcome_vector, cex=.5)
Dxy C (ROC) R2 D D:Chi-sq D:p U U:Chi-sq
4.88e-01 7.44e-01 1.46e-01 9.63e-02 9.19e+01 NA 1.98e-02 2.07e+01
U:p Q Brier Intercept Slope Emax E90 Eavg
3.27e-05 7.65e-02 1.45e-01 1.00e+00 1.79e+00 1.53e-01 1.02e-01 5.55e-02
S:z S:p
-1.28e+00 2.00e-01
ROC curve
plot(ev_glm(method="original", lp="predvals_lasso_1se", outcome="unsuccessful", data=mi_sum, samples=1, return="roc"))
Model approximation
The data below shows the steps of removal of variables from the full model. This is the first step to model approximation. Model all variables against the predicted values from the full model, and then can estimate how well the set of variables predicted the predicted values from the full model.
# Regress the full model's linear predictor on all predictors, then show the
# order in which fastbw() would delete them — step 1 of model approximation
approx_full_formula <- as.formula(paste0("predvals_full_model ~", (paste(model_vars[2:length_model_vars], collapse = "+"))))
# sigma=1 so the OLS fit uses unit error variance (standard for approximation)
approx_full_model <- ols(approx_full_formula, sigma=1, data=mi_sum)
# aics=10000 forces fastbw to delete every variable, printing the full sequence
fastbw(approx_full_model, aics=10000)
Deleted Chi-Sq d.f. P Residual d.f. P AIC R2
xray_cavit 0.08 1 0.7770 0.08 1 0.7770 -1.92 1.000
alcoholhx 2.86 2 0.2388 2.94 3 0.4003 -3.06 0.997
dishx_any_minus 5.37 1 0.0204 8.32 4 0.0806 0.32 0.993
evertb 7.09 1 0.0077 15.41 5 0.0087 5.41 0.986
bmi 18.41 1 0.0000 33.82 6 0.0000 21.82 0.970
smear_pos 18.50 1 0.0000 52.32 7 0.0000 38.32 0.953
non_white 26.00 1 0.0000 78.31 8 0.0000 62.31 0.930
female 30.30 1 0.0000 108.61 9 0.0000 90.61 0.903
educ_years 52.73 1 0.0000 161.34 10 0.0000 141.34 0.856
hiv 61.26 1 0.0000 222.59 11 0.0000 200.59 0.801
diabetes_yn 72.61 1 0.0000 295.21 12 0.0000 271.21 0.736
age_group 66.46 4 0.0000 361.67 16 0.0000 329.67 0.676
smokhx 95.49 2 0.0000 457.16 18 0.0000 421.16 0.591
lab_hgb 250.25 1 0.0000 707.41 19 0.0000 669.41 0.367
drughx 410.34 2 0.0000 1117.75 21 0.0000 1075.75 0.000
Approximate Estimates after Deleting Factors
Coef S.E. Wald Z P
[1,] -1.671 0.03255 -51.34 0
Factors in Final Model
None
We see that the variables through age can predict the full model with an R2 of 0.986.
The code below confirms how well the boot50 model approximates the predicted values from the full model.
# How well does the boot50 variable set approximate the full model's
# predicted values? Regress full-model predictions on the boot50 variables.
approx_boot_formula <- as.formula(paste0("predvals_full_model ~ ", paste(boot_vars, collapse = " + ")))
approx_boot_model <- ols(approx_boot_formula, data = mi_sum, x = TRUE, y = TRUE)
coef_approx_model <- coef(approx_boot_model)
# R-squared of the approximation
approx_boot_model$stats["R2"]
R2
0.986
The mean absolute error between predicted values from the approximate model and predicted values from the full model is:
# Predictions from the approximate (boot50-variable) OLS model
predvals_approx_mod <- predict(approx_boot_model)
mi_sum$predvals_approx_mod <- predvals_approx_mod
# Mean absolute error between approximate-model and full-model predictions
abs_err <- mean(abs(predvals_approx_mod - predvals_full_model))
abs_err
[1] 0.105
Performance
Brier score, calibration slope, and calibration intercept of the model:
approx_perf <- ev_glm(method="original", lp="predvals_approx_mod", outcome="unsuccessful", data=mi_sum, samples=200, return="performance")
brier <- approx_perf["Brier", "apparent"]
slope <- approx_perf["Slope", "apparent"]
intercept <- approx_perf["Intercept", "apparent"]
perf_approx_model <- rbind("Brier score" = brier, "Calibration slope" = slope, "Calibration intercept" = intercept)
perf_approx_model
[,1]
Brier score 0.137
Calibration slope 0.994
Calibration intercept -0.004
CI for apparent c-statistic:
c_approx_model <- c_ci(predvals_approx_mod, outcome_vector)
c_approx_model
c_stat LB UB
[1,] 0.778 0.742 0.813
Calibration plot
cal_plot_solo(mi_sum, predvals_approx_mod, outcome_vector)
ROC curve
plot(ev_glm(method="original", lp="predvals_approx_mod", outcome="unsuccessful", data=mi_sum, samples=1, return="roc"))
Model comparison
Coefficients
# Extract names of variables retained in the one-shot backward-selected model
selected_vars_int <- names(coef_selected_model[which(coef_selected_model != 0)])
# Strip factor-level suffixes; "55+" must be escaped ("55\\+") because "+"
# is a regex quantifier — unescaped it leaves a stray "age_group+" name
selected_vars_int <- gsub("Former|Never|Current|25-35|35-45|45-55|55\\+", "", selected_vars_int)
selected_vars_int <- unique(selected_vars_int)
# Drop the intercept
selected_vars <- selected_vars_int[-1]
selected_form <- as.formula(paste0("unsuccessful ~ ", paste(selected_vars, collapse = " + ")))
selected_model <- lrm(selected_form, data = mi_sum, x = TRUE, y = TRUE)
predvals_selected_model <- predict(selected_model)
# Model performance selected model - not printed because it matches the boot50
# model, but stored for the comparison table
c_selected_model <- c_ci(predvals_selected_model, outcome_vector)
# Use the same B and seed as all other validations for comparability
# (the original call used validate() defaults: B = 40 and no seed)
selected_model_val <- validate(selected_model, B = 200, seed = SEED)
perf_selected_model <- save_val_results(selected_model_val)
# Extract coefficients from each modeling approach and compare
# Each block converts a coefficient vector to a data frame and normalizes the
# term names (stripping parentheses and "=") so they merge across models.
coef_full <- data.frame(coef_full_model) %>% rownames_to_column("variable") %>% rename(full_model = coef_full_model) %>%
mutate(variable = gsub("[(]|[)]|=", "", variable))
# NOTE(review): "rcsage, 4" already appears in the first gsub pattern here,
# making the second gsub on the next line redundant — confirm and simplify
coef_lasso_min <- data.frame(as.array(coef_lasso_min)) %>% rownames_to_column("variable") %>% rename(lasso_min = X1) %>%
mutate(variable = gsub("[(]|[)]|=|rcsage, 4", "", variable),
variable = gsub("rcsage, 4", "", variable))
coef_lasso_1se <- data.frame(as.array(coef_lasso_1se)) %>% rownames_to_column("variable") %>% rename(lasso_1se = X1) %>%
mutate(variable = gsub("[(]|[)]|=", "", variable),
variable = gsub("rcsage, 4", "", variable))
coef_selected <- data.frame(coef_selected_model) %>% rownames_to_column("variable") %>% rename(selected_model = coef_selected_model) %>%
mutate(variable = gsub("[(]|[)]|=", "", variable),
variable = gsub("rcsage, 4", "", variable))
coef_median <- data.frame(coef_boot_median) %>% rownames_to_column("variable") %>% rename(boot_median = coef_boot_median) %>%
mutate(variable = gsub("[(]|[)]|=", "", variable),
variable = gsub("rcsage, 4", "", variable))
coef_boot50_model <- data.frame(coef_boot50_vars) %>% rownames_to_column("variable") %>% rename(boot50_model = coef_boot50_vars) %>%
mutate(variable = gsub("[(]|[)]|=", "", variable))
coef_shrink <- data.frame(coef_shrink_model) %>% rownames_to_column("variable") %>% rename(coef_shrink = coef_shrink_model) %>%
mutate(variable = gsub("[(]|[)]|=", "", variable))
coef_approx <- data.frame(coef_approx_model) %>% rownames_to_column("variable") %>% rename(approx_model = coef_approx_model) %>%
mutate(variable = gsub("[(]|[)]|=", "", variable))
# Use the boot-median ordering for the rows of the combined table
var_order <- coef_median %>% pull(variable)
# Full outer merge across all models; variables absent from a model get 0
coefficients_table <- Reduce(function(x,y) merge(x, y, by = "variable", all.x = TRUE, all.y = TRUE),
list(coef_full, coef_selected, coef_median, coef_boot50_model, coef_shrink, coef_approx, coef_lasso_min, coef_lasso_1se)) %>%
mutate_if(is.numeric, ~replace(., is.na(.), 0)) %>%
arrange(factor(variable, var_order))
coefficients_table
Performance measures
# -- C statistics ---
# NOTE: the rownames assigned below must stay in the same order as the
# objects listed in rbind()/cbind() — they are matched positionally
c_stat_list <- as.data.frame(rbind(c_full_model, c_selected_model, c_boot_median, c_boot50_model, c_shrink, c_approx_model, c_lasso_min, c_lasso_1se))
rownames(c_stat_list) <- c("full_model", "selected_model" , "boot_median", "boot50_model", "shrink", "approx", "lambda_min", "lambda_1se")
# Format as "c (LB, UB)" strings, rounded to 2 decimals
c_stats <- c_stat_list %>%
rownames_to_column("model") %>%
mutate_if(is.numeric, round, 2) %>%
mutate(c_ci = paste(c_stat, " (", LB, ", ", UB, ")", sep="")) %>%
select(model, c_ci)
#--- Brier score, slope, and intercept ---
performance_list <- as.data.frame(t(cbind(perf_full_model, perf_selected_model, perf_boot_median, perf_boot50_model, perf_shrink, perf_approx_model, perf_lasso_min, perf_lasso_1se)))
rownames(performance_list) <- c("full_model", "selected_model", "boot_median", "boot50_model", "shrink", "approx", "lambda_min", "lambda_1se")
performance <- performance_list %>%
rownames_to_column("model") %>%
rename(brier_score = "Brier score",
cal_slope = "Calibration slope",
cal_int = "Calibration intercept")
#---- Combine data ---
models_performance <- performance %>%
full_join(c_stats, by="model")
models_performance
# Transpose
t_model_performance <- as.data.frame(t(models_performance)) %>% rownames_to_column("stat")
Main results
Classification table
Based on coefficients from the boot50 model, here is a classification table broken down by deciles. Each row denotes the classification values for people above that decile are considered as “unsuccessful” (positive) outcome vs. “successful” (negative) outcome.
# Positive = unsuccessful
# Negative = successful
n_total = nrow(mi_sum)
all_positive = nrow(filter(mi_sum, unsuccessful==1))
all_negative = nrow(filter(mi_sum, unsuccessful==0))
# Split the cohort into deciles of the boot50 linear predictor; for each
# decile cutoff, classify everyone ABOVE the cutoff as positive and compute
# the resulting confusion-matrix counts and operating characteristics.
classification_table <- mi_sum %>%
mutate(risk_group = ntile(predvals_boot50, 10)) %>%
group_by(risk_group) %>%
summarize(
n_group = n(),
outcome = sum(unsuccessful),
no_outcome = n_group-outcome,
obs_prob_outcome = mean(unsuccessful),
# NOTE(review): predvals_boot50 is the linear predictor (log-odds), so
# this is a mean linear predictor, not a probability — confirm intent
pred_prob_outcome = mean(predvals_boot50),
min_pred_val = min(predvals_boot50),
max_pred_val = max(predvals_boot50)) %>%
# Cumulative counts from the lowest decile upward; those below the cutoff
# are classified negative, so tp = positives above the cutoff, etc.
mutate(n_below_cut_off = cumsum(n_group),
perc_below_cut_off = n_below_cut_off/n_total,
cm_outcome = cumsum(outcome),
cm_no_outcome =cumsum(no_outcome),
tp = all_positive - cm_outcome,
tn = cm_no_outcome,
fp = all_negative - cm_no_outcome,
fn = cm_outcome,
sensitivity = tp/(tp+fn),
specificity = tn/(tn+fp),
ppv = tp/(tp+fp),
npv = tn/(tn+fn)) %>%
select(risk_group, max_pred_val, n_below_cut_off, perc_below_cut_off, cm_outcome, cm_no_outcome, tp, tn, fp, fn, sensitivity, specificity, ppv, npv)
classification_table
Nomogram
# Set data distribution (rms needs an active datadist for nomogram())
dd <- datadist (mi_sum)
options(datadist="dd")
# Print the fitted coefficients for reference (console output follows)
boot_model$coefficients
Intercept lab_hgb hiv drughx=Former drughx=Current
0.9089 -0.1709 0.7802 0.4017 1.1243
diabetes_yn age_group=25-35 age_group=35-45 age_group=45-55 age_group=55+
0.6902 -0.4444 -0.6439 -1.0635 -0.4327
educ_years smokhx=Former smokhx=Current female bmi
-0.0547 0.5904 0.4841 -0.3564 -0.0434
smear_pos non_white
0.3702 0.3924
# Human-readable predictor labels; Newlabels() applies them in the model's
# predictor order, so this vector must stay aligned with the fit above.
pred_labels <- c("Hemoglobin (g/dL)", "HIV", "Drug use", "Diabetes",
"Age group", "Education (years)", "Tobacco use", "Female sex",
"BMI", "Smear positive", "Non-white")
boot_model <- Newlabels(boot_model, pred_labels)
# Nomogram on the probability scale (plogis maps log-odds -> risk)
nom <- nomogram(
boot_model,
fun = plogis,
fun.at = c(.001, .01, .05, seq(.1, .9, by = .1), .95, .99, .999),
funlabel = "Risk of unsuccessful outcome",
vnames = "labels"
)
plot(nom)
Risk plot
# ----- Risk plot: predicted-value histogram with observed outcome overlay -----
pred_vals <- predvals_boot50
graph_data <- mi_sum %>%
mutate(
# Censor extreme observations to [-4, 1.5] (equivalent to the two
# ifelse() clamps, written as a single pmin/pmax clamp)
pred_vals = pmax(pmin(pred_vals, 1.5), -4),
# 15 equal-width bins across the censored range
risk_group = cut(pred_vals, breaks = 15)
) %>%
group_by(risk_group) %>%
summarize(
outcome_prob = mean(unsuccessful),
count = n(),
min_pred_val = min(pred_vals),
max_pred_val = max(pred_vals),
mean_pred_val = mean(pred_vals)
)
# Scale factor lets the probability line share the count axis; the secondary
# axis undoes the scaling for display
scaleFactor <- max(graph_data$count) / max(graph_data$outcome_prob)
text_size <- 15
graph <- graph_data %>%
ggplot(aes(x = mean_pred_val, y = count)) +
geom_bar(stat = "identity") +
geom_line(aes(y = outcome_prob * scaleFactor), color = "tomato1", size = 1) +
scale_y_continuous("Participant count", expand = c(0, 0),
sec.axis = sec_axis(~ . / scaleFactor,
name = "Outcome probability (%)")) +
labs(x = "Predicted value") +
# Right axis styled to match the overlay line color
theme(
axis.line.y.right = element_line(color = "tomato1"),
axis.ticks.y.right = element_line(color = "tomato1"),
axis.title.y.right = element_text(color = "tomato1", size = text_size),
axis.title.y.left = element_text(size = text_size),
axis.title.x = element_text(size = text_size),
axis.text.y.right = element_text(color = "tomato1", size = 12),
axis.text.y.left = element_text(size = 12),
axis.text.x = element_text(size = 12),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black")
)
graph
#---- Save all results to Excel -----
coefficients <- coefficients_table
performance <- t_model_performance
library(openxlsx)
# Map each output sheet name to the table it should contain
sheet_data <- list(
"Model-Selection" = overview,
"Coefficients" = coefficients,
"Performance" = performance,
"Classification-Table" = classification_table
)
# Create a blank workbook and populate one sheet per table
results <- createWorkbook()
for (sheet in names(sheet_data)) {
addWorksheet(results, sheet)
writeData(results, sheet = sheet, x = sheet_data[[sheet]])
}
# Export the file
saveWorkbook(results, here("Manuscript", "tables", "main_analyses_results.xlsx"), overwrite=TRUE)